Set up

suppressPackageStartupMessages({
  library(tidyverse)
})

Directories and File Inputs/Outputs

# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
source(paste0(analysis_dir, "/util/function-create-barplot.R"))

Read in data and process

pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>% 
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC, tumor_descriptor)

tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>% 
  filter(!tmb >= 10) %>% 
  select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count,   region_size, tmb, VAF)

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),) 

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

# Create df to use for plots
df_plot <- genomic_paired_df %>% 
  left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>% 
  mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
                               TRUE ~ tumor_descriptor),
         match_id_bs = paste(tumor_descriptor, Kids_First_Biospecimen_ID, sep = "_"),
         cg_id_kids = paste(cg_id, Kids_First_Participant_ID, sep = "_"),
         cg_id_kids = str_replace(cg_id_kids, "/", "_"),
         cg_id_kids = str_replace(cg_id_kids, "-", "_"),
         cg_id_kids = str_replace_all(cg_id_kids, " ", "_"),
         tumor_descriptor = factor(tumor_descriptor),
         tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))

# Let's count number of samples 
count_df <- df_plot %>% 
  group_by(tumor_descriptor, cg_id, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(cg_id) 

Define parameters for plots

# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names

Alterations per timepoint

# Define parameters for function
x_value <- count_df$tumor_descriptor
title <- paste("Variant types in PBTA cohort", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_barplots.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Alteration_type_timepoints_barplots.pdf"
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 6, height = 6)
print(p)
dev.off()
quartz_off_screen 
                2 

Alterations per timepoint in each cancer type

cg_id_id <- as.character(unique(count_df$cg_id)) 
cg_id_id <- sort(cg_id_id, decreasing = FALSE)
cg_id_id
 [1] "Adamantinomatous Craniopharyngioma"       "Atypical Teratoid Rhabdoid Tumor"         "Chordoma"                                
 [4] "Choroid plexus carcinoma"                 "CNS Embryonal tumor"                      "Craniopharyngioma"                       
 [7] "Diffuse midline glioma"                   "Dysembryoplastic neuroepithelial tumor"   "Embryonal tumor with multilayer rosettes"
[10] "Ependymoma"                               "Ewing sarcoma"                            "Ganglioglioma"                           
[13] "Glial-neuronal tumor"                     "Hemangioblastoma"                         "High-grade glioma"                       
[16] "Low-grade glioma"                         "Malignant peripheral nerve sheath tumor"  "Medulloblastoma"                         
[19] "Meningioma"                               "Neuroblastoma"                            "Neurofibroma/Plexiform"                  
[22] "Pilocytic astrocytoma"                    "Rosai-Dorfman disease"                    "Schwannoma"                              
# Define parameters for function
x_value <- count_df$match_id
title <- paste("Variant types in PBTA cohort across cancer groups", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_cg_id_barplots.pdf")
print(fname)
[1] "/Users/chronia/CHOP/GitHub/pbta-tumor-evolution/analyses/tmb-vaf-longitudinal/plots/Alteration_type_timepoints_cg_id_barplots.pdf"
p <- create_stacked_barplot_variant_cg_id(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 30, height = 18)
print(p)
dev.off()
quartz_off_screen 
                2 

Alterations per timepoint in each cancer type defined by cgGFAC

df_plot_cgGFAC <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  arrange(timepoint_cgGFAC_n) %>% 
  group_by(tumor_descriptor, cgGFAC, timepoint_cgGFAC_n, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC)) 
cgGFAC_id <- sort(cgGFAC_id, decreasing = FALSE)
cgGFAC_id
[1] "ATRT"  "DMG"   "HGG"   "LGG"   "Other"
# Define parameters for function
x_value <- df_plot_cgGFAC$Kids_First_Biospecimen_ID
title <- paste("Variant types in PBTA cohort across cgGFAC", sep = " ")

# Run function
p <- create_stacked_barplot_variant_cgGFAC(count_df = df_plot_cgGFAC, x = x_value, palette = palette, title = title)

Alterations per timepoint in each cancer type and timepoint model

tm_df_plot <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  group_by(tumor_descriptor, cg_id, timepoints_models, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

tm <- as.character(unique(tm_df_plot$timepoints_models))
tm <- sort(tm, decreasing = FALSE)
tm
 [1] "Dx-Dec"         "Dx-Pro"         "Dx-Pro-Dec"     "Dx-Pro-Rec"     "Dx-Pro-Rec-Dec" "Dx-Rec"         "Dx-Rec-Dec"     "Dx-SM"         
 [9] "Pro-Dec"        "Pro-Rec"        "Pro-Rec-Dec"    "Rec-Dec"        "Rec-SM"        
# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- tm_df_plot %>%
      filter(timepoints_models == tm[i])
  
   # Define parameters for function
  x_value <- df_sub$Kids_First_Biospecimen_ID
  title <- paste(tm[i])
  
  # Run function
  p <- create_stacked_barplot_variant_cg_id(count_df = df_sub, x = x_value, palette = palette, title = title)
  
}
[1] 1
[1] 2
[1] 3
[1] 4
[1] 5
[1] 6
[1] 7
[1] 8
[1] 9
[1] 10
[1] 11
[1] 12
[1] 13

sessionInfo()
---
title: "Classification of Variants across paired longitudinal samples in the PBTA Cohort"
author: 'Antonia Chroni <chronia@chop.edu> for D3B'
date: "2023"
output:
  html_notebook:
    toc: TRUE
    toc_float: TRUE
---

# Set up
```{r load-library}
suppressPackageStartupMessages({
  library(tidyverse)
})
```

# Directories and File Inputs/Outputs
```{r set-dir-and-file-names}
# Detect the ".git" folder -- this will be in the project root directory
# Use this as the root directory to ensure proper sourcing of functions 
# no matter where this is called from
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
analysis_dir <- file.path(root_dir, "analyses", "tmb-vaf-longitudinal")
results_dir <- file.path(analysis_dir, "results")
input_dir <- file.path(analysis_dir, "input")
files_dir <- file.path(root_dir, "analyses", "sample-distribution-analysis", "results")

# Input files
pbta_file <- file.path(files_dir, "pbta.tsv") # file from add-sample-distribution module
genomic_paired_file <- file.path(files_dir, "genomic_assays_matched_time_points.tsv")
tmb_vaf_file <- file.path(results_dir, "tmb_vaf_genomic.tsv")
palette_file <- file.path(root_dir, "figures", "palettes", "oncoprint_color_palette.tsv")

# File path to plot directory
plots_dir <-
  file.path(analysis_dir, "plots")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}

source(paste0(root_dir, "/figures/scripts/theme.R"))
source(paste0(analysis_dir, "/util/function-create-barplot.R"))
```

# Read in data and process

```{r load-process-inputs}
pbta_df <- readr::read_tsv(pbta_file, guess_max = 100000, show_col_types = FALSE) %>% 
  select(Kids_First_Participant_ID, Kids_First_Biospecimen_ID, cg_multiple, cg_id, cgGFAC, tumor_descriptor)

tmb_vaf_df <- readr::read_tsv(tmb_vaf_file, guess_max = 100000, show_col_types = FALSE) %>% 
  filter(!tmb >= 10) %>% 
  select(Kids_First_Biospecimen_ID, Variant_Classification, gene_protein, mutation_count,	region_size, tmb, VAF)

genomic_paired_df <- readr::read_tsv(genomic_paired_file, guess_max = 100000, show_col_types = FALSE) %>%
  left_join(pbta_df, by = c("Kids_First_Participant_ID")) %>% 
  left_join(tmb_vaf_df, by = c("Kids_First_Biospecimen_ID")) %>%
  filter(!is.na(tmb))

# Attention as some bs specimen might not have TMB!
# If that happens, we will end up with samples lacking timepoints.

# Which patient samples don't have TMB?
# genomic_paired_df %>% 
#  filter(is.na(tmb)) %>% 
#  unique() %>% 
#  regulartable() %>%
#  fontsize(size = 12, part = "all")

descriptors_df <- genomic_paired_df %>%
  group_by(Kids_First_Participant_ID) %>%
  summarize(descriptors = paste(sort(tumor_descriptor), collapse = ", "),) 

# Vector to order timepoints
timepoints <- c("Diagnosis", "Progressive", "Recurrence", "Deceased", "Second Malignancy", "Unavailable")

# Create df to use for plots
df_plot <- genomic_paired_df %>% 
  left_join(descriptors_df, by = c("Kids_First_Participant_ID", "descriptors")) %>% 
  mutate(td_cgGFAC = case_when(grepl("Deceased", tumor_descriptor) ~ "xDeceased",
                               TRUE ~ tumor_descriptor),
         match_id = paste(tumor_descriptor, Kids_First_Participant_ID, sep = "_"),
         tumor_descriptor = factor(tumor_descriptor),
         tumor_descriptor = fct_relevel(tumor_descriptor, timepoints))

# Let's count number of samples 
count_df <- df_plot %>% 
  group_by(tumor_descriptor, cg_id, match_id, Variant_Classification) %>% 
  dplyr::count(cg_id) 

``` 

# Define parameters for plots

```{r define-parameters-for-plots}
# Read color palette
palette_df <- readr::read_tsv(palette_file, guess_max = 100000, show_col_types = FALSE) 

# Define and order palette
palette <- palette_df$hex_codes
names(palette) <- palette_df$color_names
```

# Alterations per timepoint

```{r plot-timepoint, fig.width = 6, fig.height = 6, fig.fullwidth = TRUE}
# Define parameters for function
x_value <- count_df$tumor_descriptor
title <- paste("Variant types in PBTA cohort", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_barplots.pdf")
print(fname)
p <- create_stacked_barplot_variant(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 6, height = 6)
print(p)
dev.off()
```

# Alterations per timepoint in each cancer type

```{r plot-cg-id, fig.width = 30, fig.height = 18, fig.fullwidth = TRUE}
cg_id_id <- as.character(unique(count_df$cg_id)) 
cg_id_id <- sort(cg_id_id, decreasing = FALSE)
cg_id_id

# Define parameters for function
x_value <- count_df$match_id
title <- paste("Variant types in PBTA cohort across cancer groups", sep = " ")

# Run function
fname <- paste0(plots_dir, "/", "Alteration_type_timepoints_cg_id_barplots.pdf")
print(fname)
p <- create_stacked_barplot_variant_cg_id(count_df = count_df, x = x_value, palette = palette, title = title)
pdf(file = fname, width = 30, height = 18)
print(p)
dev.off()

```

# Alterations per timepoint in each cancer type defined by cgGFAC

```{r plot-cgGFAC-n-individual-plots, fig.width = 5, fig.height = 5, fig.fullwidth = TRUE}
df_plot_cgGFAC <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  arrange(timepoint_cgGFAC_n) %>% 
  group_by(tumor_descriptor, cgGFAC, timepoint_cgGFAC_n, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

cgGFAC_id <- as.character(unique(df_plot_cgGFAC$cgGFAC)) 
cgGFAC_id <- sort(cgGFAC_id, decreasing = FALSE)
cgGFAC_id

# Define parameters for function
x_value <- df_plot_cgGFAC$match_id
title <- paste("Variant types in PBTA cohort across cgGFAC", sep = " ")

# Run function
p <- create_stacked_barplot_variant_cgGFAC(count_df = df_plot_cgGFAC, x = x_value, palette = palette, title = title)

```

# Alterations per timepoint in each cancer type and timepoint model

```{r plot-timepoint-model, fig.width = 15, fig.height = 12, fig.fullwidth = TRUE}
tm_df_plot <- df_plot %>%
  filter(!is.na(timepoints_models)) %>% 
  group_by(tumor_descriptor, cg_id, timepoints_models, Kids_First_Biospecimen_ID, Variant_Classification) %>% 
  dplyr::count(timepoint_cgGFAC_n)

tm <- as.character(unique(tm_df_plot$timepoints_models))
tm <- sort(tm, decreasing = FALSE)
tm

# Loop through variable
for (i in seq_along(tm)){
  print(i)
  df_sub <- tm_df_plot %>%
      filter(timepoints_models == tm[i])
  
   # Define parameters for function
  x_value <- df_sub$match_id
  title <- paste(tm[i])
  
  # Run function
  p <- create_stacked_barplot_variant_cg_id(count_df = df_sub, x = x_value, palette = palette, title = title)
  
}
```


```{r echo=TRUE}
sessionInfo()
```
